{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Finding Outliers with k-Means\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_ip</th>\n",
       "      <th>username</th>\n",
       "      <th>success</th>\n",
       "      <th>failure_reason</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>datetime</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2018-01-01 00:06:19.353126</th>\n",
       "      <td>223.178.55.3</td>\n",
       "      <td>djones</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 00:09:07.147971</th>\n",
       "      <td>223.178.55.3</td>\n",
       "      <td>djones</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 01:08:08.610041</th>\n",
       "      <td>6.252.142.27</td>\n",
       "      <td>asmith</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 02:37:50.329298</th>\n",
       "      <td>124.178.25.98</td>\n",
       "      <td>akim</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2018-01-01 02:45:20.382080</th>\n",
       "      <td>98.43.141.103</td>\n",
       "      <td>akim</td>\n",
       "      <td>1</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                source_ip username  success failure_reason\n",
       "datetime                                                                  \n",
       "2018-01-01 00:06:19.353126   223.178.55.3   djones        1           None\n",
       "2018-01-01 00:09:07.147971   223.178.55.3   djones        1           None\n",
       "2018-01-01 01:08:08.610041   6.252.142.27   asmith        1           None\n",
       "2018-01-01 02:37:50.329298  124.178.25.98     akim        1           None\n",
       "2018-01-01 02:45:20.382080  98.43.141.103     akim        1           None"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import sqlite3\n",
    "\n",
    "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n",
    "    logs_2018 = pd.read_sql(\n",
    "        \"\"\"\n",
    "        SELECT * \n",
    "        FROM logs \n",
    "        WHERE datetime BETWEEN \"2018-01-01\" AND \"2019-01-01\";\n",
    "        \"\"\", \n",
    "        conn, parse_dates=['datetime'], index_col='datetime'\n",
    "    )\n",
    "logs_2018.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_X(log, day):\n",
    "    \"\"\"\n",
    "    Get data we can use for the X\n",
    "    \n",
    "    Parameters:\n",
    "        - log: The logs dataframe\n",
    "        - day: A day or single value we can use as a datetime index slice\n",
    "    \n",
    "    Returns: \n",
    "        A pandas DataFrame\n",
    "    \"\"\"\n",
    "    return pd.get_dummies(log[day].assign(\n",
    "        failures=lambda x: 1 - x.success\n",
    "    ).query('failures > 0').resample('1min').agg(\n",
    "        {'username':'nunique', 'failures': 'sum'}\n",
    "    ).dropna().rename(\n",
    "        columns={'username':'usernames_with_failures'}\n",
    "    ).assign(\n",
    "        day_of_week=lambda x: x.index.dayofweek, \n",
    "        hour=lambda x: x.index.hour\n",
    "    ).drop(columns=['failures']), columns=['day_of_week', 'hour'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',\n",
       "       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',\n",
       "       'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',\n",
       "       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',\n",
       "       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',\n",
       "       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = get_X(logs_2018, '2018')\n",
    "X.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## k-Means\n",
    "Since we want a \"normal\" activity cluster and an \"anomaly\" cluster, we need to make 2 clusters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\preprocessing\\data.py:645: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
      "  return self.partial_fit(X, y)\n",
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\base.py:464: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
      "  return self.fit(X, **fit_params).transform(X)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "kmeans_pipeline = Pipeline([\n",
    "    ('scale', StandardScaler()),\n",
    "    ('kmeans', KMeans(random_state=0, n_clusters=2))\n",
    "]).fit(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cluster label doesn't mean anything to us, but we can examine the size of each cluster. We don't expect the clusters to be of equal size because anomalous activity doesn't happen as often as normal activity (we presume)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
      "  Xt = transform.transform(Xt)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0    430546\n",
       "1     93600\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds = kmeans_pipeline.predict(X)\n",
    "pd.Series(preds).value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluating the clustering\n",
    "#### Step 1: Get the true labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n",
    "    hackers_2018 = pd.read_sql(\n",
    "        'SELECT * FROM attacks WHERE start BETWEEN \"2018-01-01\" AND \"2019-01-01\";', \n",
    "        conn, parse_dates=['start', 'end']\n",
    "    ).assign(\n",
    "        duration=lambda x: x.end - x.start, \n",
    "        start_floor=lambda x: x.start.dt.floor('min'),\n",
    "        end_ceil=lambda x: x.end.dt.ceil('min')\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_y(datetimes, hackers, resolution='1min'):\n",
    "    \"\"\"\n",
    "    Get data we can use for the y (whether or not a hacker attempted a log in during that time).\n",
    "    \n",
    "    Parameters:\n",
    "        - datetimes: The datetimes to check for hackers\n",
    "        - hackers: The dataframe indicating when the attacks started and stopped\n",
    "        - resolution: The granularity of the datetime. Default is 1 minute.\n",
    "        \n",
    "    Returns:\n",
    "        A pandas Series of booleans.\n",
    "    \"\"\"\n",
    "    date_ranges = hackers.apply(\n",
    "        lambda x: pd.date_range(x.start_floor, x.end_ceil, freq=resolution), \n",
    "        axis=1\n",
    "    )\n",
    "    dates = pd.Series()\n",
    "    for date_range in date_ranges:\n",
    "        dates = pd.concat([dates, date_range.to_series()])\n",
    "    return datetimes.isin(dates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "is_hacker = get_y(X.reset_index().datetime, hackers_2018)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2: Calculate Fowlkes Mallows Score\n",
    "This indicates percentage of the observations belong to the same cluster in the true labels and in the predicted labels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8395916262911648"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import fowlkes_mallows_score\n",
    "\n",
    "fowlkes_mallows_score(is_hacker, preds)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
